package com.spider.processor;

import com.spider.entites.Filed;
import com.spider.entites.TaskSetting;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.List;

/**
 * Page crawling.
 * Mainly defines the crawl paths and field names, as well as the home page settings.
 *
 * <p>Declared as a prototype-scoped Spring component. Both {@link #process(Page)} and
 * {@link #getSite()} read state that is only assigned by
 * {@link #initPageProcessor(TaskSetting, List)}, so that method has to be called first.
 *
 * @author panglin
 * @date 2020/11/20 21:01
 */
@Component
@Scope("prototype")
public class DetailPageProcessor implements PageProcessor {

    /** Task settings read by {@link #getSite()} (timeout, sleep time, retry count). */
    private TaskSetting setting;

    /** Field definitions; each one supplies a result key and an XPath expression. */
    private List<Filed> filed;

    /**
     * Stores the task settings and field definitions on this processor.
     *
     * @param setting settings later read by {@link #getSite()}
     * @param fileds  field definitions later iterated by {@link #process(Page)}
     * @return this processor, so the call can be passed straight on as a {@link PageProcessor}
     */
    public PageProcessor initPageProcessor(TaskSetting setting, List<Filed> fileds) {
        this.filed = fileds;
        this.setting = setting;
        return this;
    }

    /**
     * Evaluates every configured XPath against the page's HTML and stores each result
     * on the page under the field's key.
     *
     * @param page the page to extract values from and to record the results on
     */
    @Override
    public void process(Page page) {
        for (Filed rule : this.filed) {
            // Resolve the key before the value, mirroring argument evaluation order.
            String key = rule.getFiled();
            String value = page.getHtml().xpath(rule.getXpath()).get();
            page.putField(key, value);
        }
    }

    /**
     * Builds a new {@link Site} on each call, configured from the stored task settings.
     *
     * @return a site carrying the configured timeout, sleep time and retry count
     */
    @Override
    public Site getSite() {
        Site site = Site.me();
        site = site.setTimeOut(this.setting.getTimeOut());
        site = site.setSleepTime(this.setting.getSleepTime());
        site = site.setRetryTimes(this.setting.getRetryTime());
        return site;
    }
}
